#############
#############
#############
## Stefan Müller and Sven-Oliver Proksch: 
## Nostalgia in European Party Politics:
## A Text-Based Measurement Approach
## British Journal of Political Science
##
## Script returns all tables and plots 
## reported in SI Section C
## (Figure A3, Table A2, Table A3, Table A4, Figure A4, Figure A5, Figure A6)
## 
## Note: Table A5, Table A6, Table A7, Table A15, Figure A12, Figure A13
## are created in 01_apply_classifiers_and_text_descriptives.R
## since they require sentence-level data
#############
#############
#############

library(dplyr)               # CRAN v1.1.2 
library(ggplot2)             # CRAN v3.4.2 
library(xtable)              # CRAN v1.8-4
library(stringr)             # CRAN v1.5.0
library(tidyr)               # CRAN v1.3.0
library(ggcorrplot)          # CRAN v0.1.4
library(ggbeeswarm)          # CRAN v0.7.2
library(cowplot)             # CRAN v1.1.1
library(tidycomm)            # CRAN v0.2.1
library(readr)               # CRAN v2.1.4
library(mltest)              # CRAN v1.0.1
library(quanteda)            # CRAN v3.3.1
library(quanteda.textmodels) # CRAN v0.9.6
library(caret)               # CRAN v6.0-94

# If the code does not run, one or more packages may have been 
# updated, which may result in errors or conflicts. You can solve this issue
# by installing the package version listed above or by using the 
# groundhog package:
# after installing groundhog using install.packages("groundhog")
# change library(name_of_package) to
# groundhog::groundhog.library(name_of_package, date = "2023-09-04")
# Instead of adjusting the library() function for each package, 
# you can adjust them at all once using the
# the following syntax:
# groundhog.library("
#                   library('pkgA')
#                   library('pkgB')
#                   library('pkgC')", date = "2023-09-04")
# More details are available at: https://groundhogr.com/using/

# function for custom ggplot2 scheme
source("function_theme_base.R")

# load spreadsheet that includes the hand-coding of sentences
# by our four RAs
dat_3_joined <- read_csv("data_coded_round_3.csv")

table(dat_3_joined$translation_coder01)
table(dat_3_joined$translation_coder02)
table(dat_3_joined$translation_coder03)
table(dat_3_joined$translation_coder04)

# calculate translation accuracy statistics
dat_3_joined <- dat_3_joined |> 
    mutate(translation_agreemeent_coders = 
               translation_coder02 + translation_coder01 + translation_coder04 + translation_coder03) |> 
    mutate(translation_at_least_1 = ifelse(translation_agreemeent_coders >= 1, 1, 0)) |> 
    mutate(translation_at_least_2 = ifelse(translation_agreemeent_coders >= 2, 1, 0)) |> 
    mutate(translation_at_least_3 = ifelse(translation_agreemeent_coders >= 3, 1, 0)) |> 
    mutate(translation_at_least_4 = ifelse(translation_agreemeent_coders >= 4, 1, 0))


table(dat_3_joined$translation_at_least_1)

table(dat_3_joined$translation_at_least_2)

table(dat_3_joined$translation_at_least_3)

table(dat_3_joined$translation_at_least_4)

# chnage values to integer
dat_3_joined <- dat_3_joined |> 
    mutate(nostalgic_coder01 = as.integer(nostalgic_coder01),
           nostalgic_coder02 = as.integer(nostalgic_coder02),
           nostalgic_coder03 = as.integer(nostalgic_coder03),
           nostalgic_coder04 = as.integer(nostalgic_coder04)) |> 
    mutate(nostalgia_agreemeent_coders = 
               nostalgic_coder02 + nostalgic_coder03 + 
               nostalgic_coder01 + nostalgic_coder04)

# check how many coders assigned a sentence as nostalgic
dat_3_joined <- dat_3_joined |> 
    mutate(nostalgic_at_least_1 = ifelse(nostalgia_agreemeent_coders >= 1, 1, 0)) |> 
    mutate(nostalgic_at_least_2 = ifelse(nostalgia_agreemeent_coders >= 2, 1, 0)) |> 
    mutate(nostalgic_at_least_3 = ifelse(nostalgia_agreemeent_coders >= 3, 1, 0)) |> 
    mutate(nostalgic_at_least_4 = ifelse(nostalgia_agreemeent_coders >= 4, 1, 0))


dat_at_least_3 <- dat_3_joined |> 
    filter(nostalgic_at_least_3 == 1)

nrow(dat_at_least_3)

table(dat_at_least_3$cultural_economic_nostalgia_coder02)


dat_3_joined_subset <- dat_3_joined |>
    select(nostalgic = nostalgic_at_least_3, 
           starts_with("nostalgic_at_"),
           text, countryname, doc_id, 
           translation_inaccurate = translation_at_least_2)

# training set = 80% of sentences
0.8 * nrow(dat_3_joined_subset)


# reshuffle order randomly (to avoid that test set are the last X coded sentences)
set.seed(235)
rows <- sample(nrow(dat_3_joined_subset))

dat_3_joined_subset_random <- dat_3_joined_subset[rows, ]


dat_3_joined_train <- dat_3_joined_subset_random[1:(0.8 * nrow(dat_3_joined_subset_random)), ]
nrow(dat_3_joined_train)

dat_3_joined_test <- dat_3_joined_subset_random[((0.8 * nrow(dat_3_joined_subset_random)) + 1):nrow(dat_3_joined_subset_random), ]
nrow(dat_3_joined_test)

# save for BERT classification
# write_csv(dat_3_joined_train, "data_coded_rr/data_coded_train.csv")
# write_csv(dat_3_joined_test, "data_coded_rr/data_coded_test.csv")
# write_csv(dat_3_joined_subset_random, "data_coded_rr/data_coded_all.csv")


dat_3_joined_long <- dat_3_joined |> 
    select(nostalgia_dictionary, nostalgia_dictionary_emb, starts_with("nostalgic")) |> 
    gather(coder, coding, -c(nostalgia_dictionary, nostalgia_dictionary_emb)) |> 
    mutate(coder = str_remove_all(coder, "nostalgic_")) |> 
    mutate(coder = str_to_title(coder))


# get intercoder reliability statistics across the four coders

# https://cran.r-project.org/web/packages/tidycomm/vignettes/v04_icr.html
dat_long_codes <- dat_3_joined |> 
    select(starts_with("nostalgic_"), doc_id) |>
    select(-starts_with("nostalgic_at_least")) |> 
    gather(coder_id, nostalgic, -doc_id) |> 
    rename(post_id = doc_id) 

table(dat_long_codes$coder_id)

# get agreement statistics
agree <- dat_long_codes |> 
    test_icr(post_id, coder_id, 
             fleiss_kappa = TRUE,
             na.omit = TRUE,
             agreement = TRUE,
             levels = "nominal") |> 
    as.data.frame()

# get number of sentences
sentences <- length(unique(dat_long_codes$post_id))

# create table
tab_reliability <- agree |> 
    mutate(Sentences = sentences,
           Agreement = paste0(round(Agreement * 100, 1), "%")) |> 
    dplyr::select(Coders = n_Coders, 
                  Sentences,
                  Categories = n_Categories,
                  Agreement,
                  `Krippendorff's Alpha`  = Krippendorffs_Alpha,
                  `Fleiss' Kappa` = Fleiss_Kappa)

# Table A02 ----
print(xtable(tab_reliability,
             caption = "Inter-coder reliability",
             digits = 3,
             align = c("lllllll"), 
             label ="tab:intercoder"),
      type = "latex",
      size = "small",
      align = c("lllllll"), 
      file = "tab_a02.tex",
      include.rownames = FALSE,
      caption.placement = "top")


# prepare test and training set
# load hand-coded PolNos data
# dat_handcoding <- read_csv("data_polnos_handcoding.csv")

# dat_handcoding_subset <- dat_handcoding |>
#     select(nostalgic = nostalgic_at_least_3, 
#            starts_with("nostalgic_at_"),
#            text, countryname, doc_id, 
#            translation_inaccurate = translation_at_least_2)
# 
# # training set = 80% of sentences
# 0.8 * nrow(dat_handcoding_subset)
# 
# 
# # reshuffle order randomly (to avoid that test set are the last X coded sentences)
# set.seed(235)
# rows <- sample(nrow(dat_handcoding_subset))
# 
# dat_handcoding_subset_random <- dat_handcoding_subset[rows, ]
# 
# 
# dat_handcoding_train <- dat_handcoding_subset_random[1:(0.8 * nrow(dat_handcoding_subset_random)), ]
# nrow(dat_handcoding_train)
# 
# dat_handcoding_test <- dat_handcoding_subset_random[((0.8 * nrow(dat_handcoding_subset_random)) + 1):nrow(dat_handcoding_subset_random), ]
# nrow(dat_handcoding_test)

# save for DistilBERT classification and validation (01b_fine-tune_distilbert)
# write_csv(dat_handcoding_train, "data_coded_train_clean.csv")
# write_csv(dat_handcoding_test, "data_coded_test_clean.csv")
# write_csv(dat_handcoding_subset_random, "data_coded_all_clean.csv")


# dat_handcoding_long <- dat_handcoding |> 
#     select(nostalgia_dictionary, nostalgia_dictionary_emb, starts_with("nostalgic")) |> 
#     gather(coder, coding, -c(nostalgia_dictionary, nostalgia_dictionary_emb)) |> 
#     mutate(coder = str_remove_all(coder, "nostalgic_")) |> 
#     mutate(coder = str_to_title(coder))

# load dictionary
data_dictionary_nostalgia <- readRDS("dictionary_full.rds")

# load training and test sets created above 
# (same as training and test sets for Python-based DistilBERT classifier)
dat_train <- read.csv("data_coded_train_clean.csv",
                      fileEncoding = "utf-8",
                      stringsAsFactors = FALSE)

dat_test <- read.csv("data_coded_test_clean.csv",
                     fileEncoding = "utf-8",
                     stringsAsFactors = FALSE)


# create tokens objects for test and training set
toks_train <- dat_train |> 
    corpus(text_field = "text") |> 
    tokens()


toks_test <- dat_test |> 
    corpus(text_field = "text") |> 
    tokens()

# dictionary classification and aggregation

dat_dict_test <- toks_test |> 
    tokens_lookup(dictionary = data_dictionary_nostalgia) |> 
    dfm() |> 
    quanteda::convert(to = "data.frame")


# estimate sentiment
dat_sent <- toks_test |> 
    tokens_lookup(dictionary = data_dictionary_LSD2015,
                  nested_scope = "dictionary") |> 
    dfm() |> 
    quanteda::convert(to = "data.frame") |> 
    mutate(sentiment = log((positive + neg_negative + 0.5) / (negative + neg_positive + 0.5)))


# combine sentiment data with nostalgia dictionary results
dat_dict_all <- left_join(dat_dict_test, dat_sent) |> 
    bind_cols(dat_test)

# get nostalgia base dictionary score and embeddings + base
dat_dict_all <- dat_dict_all |> 
    mutate(nostalgia_base_count = nostalgia.nostalgia_davalos + 
               nostalgia.nostalgia_handcoding + 
               nostalgia.nostalgia_extensions) |> 
    mutate(nostalgia_emb_count = nostalgia_base_count + nostalgia_emb)


# check whether sentence is coded as nostalgic using boolean approach
dat_dict_all <- dat_dict_all |> 
    mutate(nostalgia_base = ifelse(nostalgia_base_count > 0, 1, 0),
           nostalgia_base_and_emb = ifelse(nostalgia_emb_count > 0, 1, 0),
           nostalgia_base_sent = ifelse(nostalgia_base == 1 & sentiment > 0, 1, 0),
           nostalgia_emb_sent = ifelse(nostalgia_base_and_emb == 1 & sentiment > 0, 1, 0))


# continue with supervised classifiers

dfmat_train <- dfm(toks_train)
dfmat_test <- dfm(toks_test)


# train SVM model
tmod_svm <- textmodel_svm(dfmat_train, y = dfmat_train$nostalgic)

# match dfms since only features from training set can be considered in test set
dfmat_test_match <- dfm_match(dfmat_test, features = featnames(dfmat_train))


# predict (note that current version requires matched dfms)
pred_svm <- predict(tmod_svm, newdata = dfmat_test_match)

# add predictions to data frame as "nostalgia_svm"
dat_all <- dat_dict_all |> 
    mutate(nostalgia_svm = pred_svm)

# load DistilBERT predictions from 01b_fine-tune_distilbert.py
dat_bert <- read.csv("data_classified_bert_test.csv")

# merge results
dat_all_merged <- left_join(dat_all, dat_bert)


# calculate metrics for the six approaches
classifiers <- c("nostalgia_base_and_emb",
                 "nostalgia_base",
                 "nostalgia_base_sent",
                 "nostalgia_emb_sent",
                 "nostalgia_svm",
                 "nostalgic_bert")


# empty data frame for results
dat_metrics_all <- data.frame()

for (i in classifiers) {
    
    cat("\n")
    
    print(i)
    
    # create cross-table
    tab <- table(predicted = dat_all_merged[, i],
                 coded = dat_all_merged$nostalgic)
    
    print(tab)
    
    conf <- confusionMatrix(tab, positive = "1",
                            mode = "everything")
    
    
    # get performance metrics
    dat_metrics <- data.frame(
        Classifier = i,
        Accuracy = round(conf$overall["Accuracy"], 2),
        F1 = round(conf$byClass["F1"], 2),
        Precision = round(conf$byClass["Precision"], 2),
        Recall = round(conf$byClass["Recall"], 2)
        
    )
    
    print(dat_metrics)
    
    rownames(dat_metrics) <-  NULL
    
    dat_metrics_all <- bind_rows(dat_metrics, dat_metrics_all)
    
    
}

# tidy up data frame
dat_metrics_all <- dat_metrics_all |> 
    arrange(-F1) |> 
    mutate(Classifier = str_remove_all(Classifier, "nostalgia_")) |> 
    mutate(Classifier = dplyr::recode(Classifier, "svm" = "SVM", 
                                      "base" = "Dictionary",
                                      "base_sent" = "Dictionary + Positive Sentiment",
                                      "nb" = "Naive Bayes",
                                      "nostalgic_bert" = "DistilBERT",
                                      "emb_sent" = "Dictionary + Embeddings-Based Dictionary + Positive Sentiment",
                                      "base_and_emb" = "Dictionary + Embeddings-Based Dictionary"))

# Table A03 ----
print(xtable(dat_metrics_all,
             caption = "Classification performance of nostalgia for held-out test set",
             label="tab:performance"),
      type = "latex",
      size = "footnotesize",
      file="tab_a03.tex",
      include.rownames = FALSE,
      caption.placement = "top")


# overview if translation errors: remove English texts since they have not been machine translated
dat_translations_sum <- dat_3_joined |> 
    filter(!countryname %in% c("Ireland", "United Kingdom"))

# get percentage of incomprehensible sentences after excluding Ireland and UK
mean(dat_translations_sum$translation_at_least_2)
mean(dat_translations_sum$translation_at_least_3)


# create table with country-level results
dat_translations_sum_tab <- dat_translations_sum |> 
    group_by(countryname, nostalgic_at_least_3) |> 
    summarise(mean_wrong = paste0(100 * round(mean(translation_at_least_2), 2), "%")) |> 
    spread(nostalgic_at_least_3, mean_wrong) |> 
    rename(Country = countryname, 
           `Not Nostalgic` = `0`,
           Nostalgic = `1`)


## Table A04 ----
print(xtable(dat_translations_sum_tab,
             caption = "Incomprehensibly translated sentences (threshold: at least two out of four coders labelled translation of sentence as inaccurate)",
             digits = 3,
             label="tab:translation_accuracy"),
      type = "latex",
      size = "footnotesize",
      file="tab_a04.tex",
      include.rownames = FALSE,
      caption.placement = "top")



# dataset with one observation per manifesto
dat_manifestolevel_raw <- readRDS("data_nostalgia_manifestolevel.rds")

# get correlation coefficients for manifesto-level nostalgia

# select relevant variables
dat_nost <- dat_manifestolevel_raw |> 
    select(nostalgia_sentences_per_1000,
           nostalgia_sentences_per_1000_emb,
           nostalgia_sentences_per_1000_sentiment,
           nostalgia_sentences_per_1000_sentiment_emb,
           nostalgia_sentences_per_1000_svm,
           nostalgia_sentences_per_1000_bert)

names(dat_nost)

# rename column names
colnames(dat_nost) <-  c("Dictionary",
                         "Dictionary +\nEmbeddings",
                         "Dictionary +\nSentiment",
                         "Dictionary +\nEmbeddings +\nSentiment",
                         "SVM", 
                         "DistilBERT")

# calculate correlations
cors_nost <- cor(dat_nost)

cors_nost

# Figure A03 ----
ggcorrplot(cors_nost,
           show.diag = TRUE,
           lab_col = "grey20",
           colors = c("white", "white", "white"),
           lab = TRUE,
           lab_size = 4.5,
           ggtheme = theme_baser,
) +
    theme(legend.position = "none")
ggsave("fig_a03.pdf", width = 7, height = 5)


# compare surveys and text-based estimates
dat_countrylevel_10_20 <-  dat_manifestolevel_raw |> 
    mutate(year = as.numeric(substr(edate, 1, 4))) |> 
    filter(year >= 2010) |> 
    group_by(countryname) |> 
    summarise(mean_10_20 = mean(nostalgia_sentences_per_1000),
              mean_10_20_emb = mean(nostalgia_sentences_per_1000_emb),
              max_year = max(year)) 

# load PEW survey data
dat_survey <- read.csv("nostalgia_survey_pew.csv")

# merge data
dat_country_survey <- left_join(dat_survey,
                                dat_countrylevel_10_20,
                                by = "countryname")


# correlations
cor_better <- cor(dat_country_survey$life_better,
                  dat_country_survey$mean_10_20,
                  use = "pairwise.complete")

set.seed(355)
p_better <- ggplot(dat_country_survey, aes(x = life_better / 100, y = mean_10_20)) +
    geom_smooth(method = "lm", alpha = 0.3, colour = "grey50") +
    geom_point(size = 3, alpha = 0.8) +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1),
                       breaks = c(seq(.20, .80, .20))) +
    scale_y_continuous(limits = c(5, 35)) +
    ggrepel::geom_label_repel(aes(label = countryname)) +
    ggplot2::annotate("text", x = 0.75, y = 30, 
                      label = paste0("r=", round(cor_better, 2)),
                      size = 5) +
    labs(x = "Agreement: Life is Better than 50 Years Ago",
         y = "Nostalgic Sentences (per 1,000 Sentences)")


cor_worse <- cor(dat_country_survey$life_worse,
                 dat_country_survey$mean_10_20,
                 use = "pairwise.complete")

set.seed(355)
p_worse <- ggplot(dat_country_survey, aes(x = life_worse / 100, y = mean_10_20)) +
    geom_smooth(method = "lm", alpha = 0.3, colour = "grey50") +
    geom_point(size = 3, alpha = 0.8) +
    scale_x_continuous(labels = scales::percent_format(accuracy = 1),
                       breaks = c(seq(.20, .80, .20))) +
    scale_y_continuous(limits = c(5, 35)) +
    ggrepel::geom_label_repel(aes(label = countryname)) +
    ggplot2::annotate("text", x = .65, y = 30, label = paste0("r=", round(cor_worse, 2)),
                      size = 5) +
    labs(x = "Agreement: Life is Worse than 50 Years Ago",
         y = "Nostalgic Sentences (per 1,000 Sentences)")


# Figure A04 ----
plot_grid(p_better, p_worse, nrow = 1)
ggsave("fig_a04.pdf",
       width = 9, height = 4.5)


# prepare data to compare cultural conservative measure
# across party families: CMP, ParlGov, CHES

# merge ParlGov data
dat_parlgov <- read.csv("parties_parlgov_2023.csv")

dat_parlgov_unique <- dat_parlgov |> 
    filter(party_id != "501") |> # exclude parties that appear twice - use coding that matches Manifesto Project
    filter(party_id != "373") |> 
    filter(party_id != "809") |> 
    filter(party_id != "1313") |> 
    mutate(party = as.character(cmp)) |> 
    select(party, family_name) |> 
    unique() |> 
    filter(!is.na(party)) |> 
    group_by(party)

nrow(dat_parlgov_unique)

length(unique((dat_parlgov_unique$party)))

# recode Christian Democracy to Christian Democratic
# and Social Democracy to Social Democratic
dat_parlgov_unique <- dat_parlgov_unique |> 
    mutate(family_name = str_replace_all(family_name, "democracy", "democratic"))

table(dat_parlgov_unique$family_name)

dat_manifestolevel <- dat_manifestolevel_raw |> 
    left_join(dat_parlgov_unique) 

# nrow should not be different after merging dataset
stopifnot(nrow(dat_manifestolevel) == nrow(dat_manifestolevel_raw))


# 3 manifestos have "to be coded" as party family
# these are all from the Slovenian National Party
dat_manifestolevel |> 
    filter(family_name == "to be coded") |> 
    select(manifesto_id, partyname, countryname)

# https://en.wikipedia.org/wiki/Slovenian_National_Party

# The Slovenian National Party (Slovene: Slovenska Nacionalna Stranka, SNS) 
# is a nationalist[13] political party in Slovenia led by Zmago Jelinčič Plemeniti.
# The party is known for its Euroscepticism and opposes Slovenia's membership in NATO.
                             # [14][15] It also engages in what many consider to be historical negationism of events 
# in Slovenia during World War II.[16]


# recode this party as "Right-wing"

dat_manifestolevel <- dat_manifestolevel |> 
    mutate(family_name = dplyr::recode(family_name, "to be coded" = "Right-wing"))


table(dat_manifestolevel$family_name)

# merge CHES party family data

dat_ches_raw <- read.csv("1999-2019_CHES_dataset_means(v2).csv")

# get party ID and relevant variables

dat_ches <- dat_ches_raw |> 
    select(party = cmp_id,
           family) |> 
    unique() |> 
    group_by(party) |> 
    mutate(n_included = n()) |> 
    sample_n(size = 1) |> 
    select(-n_included) |> 
    mutate(party = as.character(party))

# merge data
dat_manifestolevel <- dat_manifestolevel |> 
    left_join(dat_ches, by = "party")



# recode party family variables
dat_manifestolevel <- dat_manifestolevel |> 
    mutate(family_parlgov = 
               dplyr::recode(family_name,
                             "Special issue" = "Other",
                             "Agrarian" = "Other")) |> 
    mutate(family_parlgov = str_to_title(family_parlgov))




dat_manifestolevel <- dat_manifestolevel |> 
    mutate(family_ches = dplyr::recode(family,
                                       "agrarian/centre" = "Other",
                                       "christdem" = "Christian Democratic",
                                       "confessional" = "Other",
                                       "cons" = "Conservative",
                                       "green" = "Green Party",
                                       "liberal" = "Liberal",
                                       "no family" = "Other",
                                       "rad left" = "Radical Left",
                                       "rad right" = "Radical Right",
                                       "regionalist" = "Other",
                                       "socialist" = "Socialist"
    ))


dat_parfam_long <- dat_manifestolevel |> 
    filter(family_parlgov != "To Be Coded") |> 
    select(family_ches, party_family_recoded, family_parlgov,
           loglibcons, manifesto_id) |> 
    gather(measure_family, family, -c("loglibcons", "manifesto_id")) |> 
    filter(!is.na(family)) |> 
    mutate(measure_family = dplyr::recode(measure_family, "family_ches" = "Chapel Hill Expert Survey",
                                          "family_parlgov" = "ParlGov",
                                          "party_family_recoded" = "Manifesto Project")) |> 
    mutate(measure_family = factor(measure_family, levels = c("Manifesto Project",
                                                              "ParlGov",
                                                              "Chapel Hill Expert Survey")))

## Figure A05 ----
ggplot(dat_parfam_long, aes(x = reorder(family, loglibcons),
                            y = loglibcons)) +
    geom_boxplot(outlier.color = "white") +
    ggbeeswarm::geom_quasirandom(alpha = 0.1) +
    coord_flip() +
    facet_wrap(~measure_family, scales = "free_y", nrow = 3) +
    scale_y_continuous(breaks = c(seq(-5, 5, 1))) +
    labs(x = NULL, y = "Cultural Conservatism")
ggsave("fig_a05.pdf",
       width = 9, height = 10)


# get party ID and relevant variables from CHES
dat_ches <- dat_ches_raw |> 
    select(party = cmp_id,
           ches_year = year, ches_lr_econ = lrecon,
           ches_lr_gen = lrgen, 
           ches_galtan = galtan)


# get relevant data from manifestos
dat_manifestos <- dat_manifestolevel_raw |> 
    select(loglibcons, loglibcons_base,
           stateconomy,
           countryname, edate, year,
           party,
           rile, logrile)


dat_ches$party <- as.character(dat_ches$party)
dat_manifestos$party <- as.character(dat_manifestos$party)

dat_joined <- left_join(dat_manifestos, 
                        dat_ches, by = "party",
                        relationship = "many-to-many")

nrow(dat_joined)

# get difference between year (election year) and ches_year (survey year)

dat_joined <- dat_joined |> 
    mutate(diff_years = abs(year - ches_year)) |> 
    arrange(countryname, party, diff_years, -year) |> 
    group_by(countryname, party, edate) |> 
    mutate(number = 1:n()) |> 
    filter(number == 1) |> # select closest survey
    filter(diff_years < 4) # select only differences of fewer than 4 yeras

nrow(dat_joined)

# 1996-2018
min(dat_joined$year)
max(dat_joined$year)

dat_joined <- dat_joined |> 
    mutate(ches_galtan = as.numeric(ches_galtan)) |> 
    mutate(ches_lr_econ = as.numeric(ches_lr_econ)) |> 
    mutate(ches_lr_gen= as.numeric(ches_lr_gen)) |> 
    filter(!is.na(ches_year)) |> 
    ungroup() |> 
    mutate(ches_decade = paste0("Decade: ", substr(ches_year, 1,3), "0s")) |> 
    ungroup() 

# calculate decade-level correlations
dat_cors_ches <- dat_joined |> 
    group_by(ches_decade) |> 
    summarise(cor = paste0("r=", 
                           format(round(cor(ches_galtan, loglibcons,
                                            use = "pairwise.complete.obs"), 2), nsmall = 2)))

# Figure A6 ----
ggplot(dat_joined, aes(x = loglibcons, 
                       y = ches_galtan)) +
    geom_smooth(colour = "grey50", method = "lm") +
    geom_point(alpha = 0.3, size = 2.5, shape = 16) + 
    facet_wrap(~ches_decade, nrow = 1) +
    geom_text(data = dat_cors_ches, aes(label = cor),
              x = -3, y = 9,
              inherit.aes = FALSE,
              size = 5) +
    labs(x = "Cultural Conservatism",
         y = "GAL/TAN (Chapel Hill Expert Survey)")
ggsave("fig_a06.pdf", 
       width = 9, height = 4)
